rm(list = ls())
library("DESeq2")
library("tidyverse")
library("EnhancedVolcano")
library("ggpubr")
library("scales")
library("grid")
p_repo <- "~/projects-etc/2022_transcriptome-construction"
p_res <- "results/2022-1025"
p_txt <- "Alison/txt"
p_full <- paste(p_repo, p_res, p_txt, sep = "/")
dir.exists(p_full)
p_Alison <- gsub('(.*)/\\w+', '\\1', p_full)
dir.exists(p_Alison)
p_save <- paste(p_repo, p_res, sep = "/")
countData_Nascent <- read.delim(
paste(p_full, "mRNA_Nascent_Nab3.txt", sep = "/"),
header = TRUE
)
metadata_Nascent <- read.delim(
paste(p_full, "deletionmeta.txt", sep = "/"),
header = TRUE
)
dds_n <- DESeq2::DESeqDataSetFromMatrix(
countData=countData_Nascent,
colData=metadata_Nascent,
design= ~ dex,
tidy = TRUE
)
dds_n
# class: DESeqDataSet
# dim: 6600 5
# metadata(1): version
# assays(1): counts
# rownames(6600): Q0010 Q0017 ... YPR204C-A YPR204W
# rowData names(0):
# colnames(5): Q_6125_N Q_7718_N Q_6126_N Q_7716_N Q_7714_N
# colData names(2): id dex
BiocGenerics::sizeFactors(dds_n) <- c(1, 1.188, 1, 2.097, 1.662)
#QUESTION Is this where we want the estimated size
factors to be input? #QUESTION Is the current means of
calculating them appropriate for the DESeq2 model?
dds_n <- DESeq2::DESeq(dds_n)
res_n <- DESeq2::results(dds_n)
DESeq2::results(dds_n, tidy = TRUE) %>% head()
summary(res_n)
# out of 6488 with nonzero total read count
# adjusted p-value < 0.1
# LFC > 0 (up) : 2111, 33%
# LFC < 0 (down) : 872, 13%
# outliers [1] : 1, 0.015%
# low counts [2] : 0, 0%
# (mean count < 0)
# [1] see 'cooksCutoff' argument of ?results
# [2] see 'independentFiltering' argument of ?results
res_n <- res_n[order(res_n$padj), ]
head(res_n)
EnhancedVolcano::EnhancedVolcano(
res_n,
lab = NA,
x = 'log2FoldChange',
y = 'pvalue'
)
dir.exists(paste(p_save, "png", sep = "/")) ||
dir.create(paste(p_save, "png", sep = "/"))
png <- EnhancedVolcano::EnhancedVolcano(
res_n,
lab = NA,
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'),
title = 'Sense Nascent Transcription',
gridlines.major = FALSE,
gridlines.minor = FALSE,
pCutoff = 0.05,
FCcutoff = 2
)
timestampedFilename <- paste0(
paste(p_save, "png", sep = "/"), "/",
"sense_differential_nascent_expression", ".",
format(Sys.time(), format = "%F_%H%M%S"), ".png"
)
ggplot2::ggsave(timestampedFilename, plot = png)
dir.exists(paste(p_save, "txt", sep = "/")) ||
dir.create(paste(p_save, "txt", sep = "/"))
p_txt <- paste(p_save, "txt", sep = "/")
write.table(
tibble::rownames_to_column(as.data.frame(res_n), "gene"),
file = paste(p_txt, "nascent_mRNA_diff_res.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
write.table(
tibble::rownames_to_column(
as.data.frame(res_n[order(res_n$padj), ]), "gene"
),
file = paste(p_txt, "nascent_mRNA_diff_res_ordered.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
#QUESTION I then go in excel and name the gene category
“name” Why does R not name this column? I wish I knew
#ANSWER write.table is printing the row
names which, by default, don’t have column names; the row names become
the first column for all lines after the initial one; see the way I
addressed this above
#COMMENT There has gotta be a better way to do this
#ANSWER There is: see below
res1_n <- tibble::rownames_to_column(
as.data.frame(res_n[order(res_n$padj), ]), "gene"
)
#filter results
res1Nab3_up <- dplyr::filter(res1_n, log2FoldChange > 2, padj < 0.05)
res1Nab3_down <- dplyr::filter(res1_n, log2FoldChange < -2, padj < 0.05)
#write filtered results for go terms
write.table(
res1Nab3_up,
file = paste(p_txt, "res1Nab3_up_SENSE.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
write.table(
res1Nab3_down,
file = paste(p_txt, "res1Nab3_down_SENSE.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
#filter results
res1Nab3_up_1p5 <- filter(res1_n, log2FoldChange > 1.5, padj < 0.05)
res1Nab3_down_1p5 <- filter(res1_n, log2FoldChange < -1.5, padj < 0.05)
#read in AS count and metadata
AS_countData_Nascent <- read.delim(
paste(p_full, "AS_mRNA_Nascent_Nab3.txt", sep = "/"),
header = TRUE
)
AS_metadata_Nascent <- read.delim(
paste(p_full, "deletionmeta_AS.txt", sep = "/"),
header = TRUE
)
dds_n_as<- DESeq2::DESeqDataSetFromMatrix(
countData = AS_countData_Nascent,
colData = AS_metadata_Nascent,
design= ~ dex,
tidy = TRUE
)
dds_n_as
#size factors #from spike in
BiocGenerics::sizeFactors(dds_n_as) <- c(1, 1.188, 1, 2.097, 1.662)
dds_n_as <- DESeq2::DESeq(dds_n_as)
res_n_as<- DESeq2::results(dds_n_as)
DESeq2::results(dds_n_as, tidy=TRUE) %>% head()
summary(res_n_as)
##Sort Summary List by p-value
res_n_as <- res_n_as[order(res_n_as$padj), ]
head(res_n_as)
#make volcano plot - saves png
png <- EnhancedVolcano::EnhancedVolcano(
res_n_as,
lab = NA,
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'),
title = 'Antisense Nascent Transcription',
gridlines.major = FALSE,
gridlines.minor = FALSE,
pCutoff = 0.05,
FCcutoff = 2
)
timestampedFilename <- paste0(
paste(p_save, "png", sep = "/"), "/",
"AS_differential_nascent_expression", ".",
format(Sys.time(), format = "%F_%H%M%S"), ".png"
)
ggplot2::ggsave(timestampedFilename, plot = png)
#write results file AS
write.table(
tibble::rownames_to_column(as.data.frame(res_n_as), "gene"),
file = paste(p_txt, "AS_Nascent_mRNA_diff_res_sf.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
write.table(
tibble::rownames_to_column(
as.data.frame(res_n_as[order(res_n_as$padj), ]), "gene"
),
file = paste(p_txt, "AS_Nascent_mRNA_diff_res_sf_ordered.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
res1_n_as <- tibble::rownames_to_column(
as.data.frame(res_n_as[order(res_n_as$padj), ]), "gene"
)
#filter AS results
AS_res1Nab3_up <- dplyr::filter(res1_n_as, log2FoldChange > 2, padj < 0.05)
AS_res1Nab3_down <- dplyr::filter(res1_n_as, log2FoldChange < -2, padj < 0.05)
#write AS to file
write.table(
AS_res1Nab3_up,
file = paste(p_txt, "AS_res1Nab3_up.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
write.table(
AS_res1Nab3_down,
file = paste(p_txt, "AS_res1Nab3_down.txt", sep = "/"),
sep = "\t",
row.names = FALSE
)
#filter AS results different
AS_res1Nab3_up_1p5 <- dplyr::filter(res1_n_as, log2FoldChange > 1.5, padj < 0.05)
AS_res1Nab3_down_1p5 <- dplyr::filter(res1_n_as, log2FoldChange < -1.5, padj < 0.05)
#overlap between sense and AS by name
Nas_AS_up <- dplyr::inner_join(
AS_res1Nab3_up,
res1Nab3_up,
by = "gene"
)
Nas_AS_down <- dplyr::inner_join(
AS_res1Nab3_down,
res1Nab3_down,
by = "gene"
)
Nas_AS_down_up <- dplyr::inner_join(
AS_res1Nab3_down,
res1Nab3_up,
by = "gene"
)
Nas_AS_up_down <- dplyr::inner_join(
AS_res1Nab3_up,
res1Nab3_down,
by = "gene"
)
#overlap between sense and AS by name - different gates
Nas_1.5_AS_up <- dplyr::inner_join(
AS_res1Nab3_up_1p5,
res1Nab3_up_1p5,
by = "gene"
)
Nas_1.5_AS_down <- dplyr::inner_join(
AS_res1Nab3_down_1p5,
res1Nab3_down_1p5,
by = "gene"
)
Nas_1.5_AS_down_up <- dplyr::inner_join(
AS_res1Nab3_down_1p5,
res1Nab3_up_1p5,
by = "gene"
)
Nas_1.5_AS_up_down <- dplyr::inner_join(
AS_res1Nab3_up_1p5,
res1Nab3_down_1p5,
by = "gene"
)
#read in AS of Nab3 TPM - from replicate analysis rmd
AS_TPM <- read.delim(
paste(p_full, "AS_TPM.txt", sep = "/"),
header = TRUE
)
#calc average TPM
AS_TPM <- AS_TPM %>%
dplyr::rowwise() %>%
dplyr::mutate(
Avg_single_Tag_AS_TPM = mean(c(Q_7718_N_AS, Q_7718_N_AS), na.rm = T)
)
AS_TPM <- AS_TPM %>%
dplyr::rowwise() %>%
dplyr::mutate(
Avg_parental_AS_TPM = mean(c(Q_6125_N_AS, Q_6126_N_AS), na.rm = T)
)
#extract only relevent columns
AS_TPM_Clean <- AS_TPM %>%
dplyr::select(c("name", "Avg_single_Tag_AS_TPM", "Avg_parental_AS_TPM"))
# Change the "name" column to "gene" column
colnames(AS_TPM_Clean)[colnames(AS_TPM_Clean) == "name"] <- "gene"
#access quantiles
quantile(AS_TPM_Clean$Avg_single_Tag_AS_TPM)
quantile(AS_TPM_Clean$Avg_parental_AS_TPM)
#more fine grain
quantile(AS_TPM_Clean$Avg_single_Tag_AS_TPM, probs = c(0.8,0.85, .9, .95))
quantile(AS_TPM_Clean$Avg_parental_AS_TPM, probs = c(0.8,0.85, .9, .95))
#highest 20% of AS by absolute level
Cuttoff_TPM_AS <- dplyr::filter(AS_TPM_Clean, Avg_single_Tag_AS_TPM > 14)
#now were doing the intersection of AS of high level and differential expression
TPM_Nas_AS_up <- dplyr::inner_join(AS_res1Nab3_up, Cuttoff_TPM_AS, by = "gene")
nrow(TPM_Nas_AS_up)
#731 AS up
#overlap between TPM_Nas_AS_up and res1Nab3_down_1p5 (one way to gate functional AS)
TPM_AS_up_Sense_down <- dplyr::inner_join(
TPM_Nas_AS_up,
res1Nab3_down_1p5,
by = "gene"
)
nrow(TPM_AS_up_Sense_down)
nrow(res1Nab3_down_1p5)
#overlap between TPM_Nas_AS_up and res1Nab3_down (different way to gate functional AS)
TPM_AS_up_Sense_down_2 <- dplyr::inner_join(
TPM_Nas_AS_up,
res1Nab3_down,
by = "gene"
)
nrow(TPM_AS_up_Sense_down_2)
nrow(res1Nab3_down)
#rename AS table names so I can compare them with sense values - they will then have different names and labels
colnames(res1_n_as) <- c(
"gene", "AS_baseMean", "AS_log2FoldChange", "AS_lfcSE",
"AS_stat", "AS_pvalue", "AS_padj"
)
#one giant matrix of both sense and Antisense
Combinded_RES <- merge(res1_n, res1_n_as, by = "gene")
#graph logfoldchange of sense and Antisense #“pairwise.complete.obs” must be used to throw out incomplete values - I know some authors will change every zero to 1 to fix this
grob1 <- grid::grobTree(
grid::textGrob(
paste(
"Pearson Correlation: ",
round(cor(
Combinded_RES$log2FoldChange,
Combinded_RES$AS_log2FoldChange,
use = "pairwise.complete.obs"),
4)
),
x = 0.5,
y = 0.97,
hjust = 0,
gp = gpar(col = 'blue', fontsize = 11, fontface = "bold")
)
)
ggplot2::ggplot(data = Combinded_RES) +
ggplot2::geom_vline(xintercept = 0, alpha = .5) +
ggplot2::geom_hline(yintercept = 0, alpha = .5) +
ggplot2::geom_point(
data = Combinded_RES,
mapping = aes(y = log2FoldChange, x = AS_log2FoldChange),
alpha = 4/10,
size = 1,
color = 'black'
) +
ggplot2::geom_smooth(
aes(y = log2FoldChange, x = AS_log2FoldChange),
method = lm,
se = TRUE,
color = 'forestgreen'
) +
ggplot2::theme_bw() +
ggplot2::annotation_custom(grob1) +
ggplot2::geom_smooth(
data = Combinded_RES,
aes(y = log2FoldChange, x = AS_log2FoldChange),
method = lm,
se = TRUE,
color = 'blue'
)
#check colnames of TPM_Nas_AS_up
colnames(TPM_Nas_AS_up)
#rename AS table names so I can compare them with sense values - they will then have different names and labels
colnames(TPM_Nas_AS_up) <- c(
"gene", "AS_baseMean", "AS_log2FoldChange", "AS_lfcSE", "AS_stat",
"AS_pvalue", "AS_padj", "Avg_single_Tag_AS_TPM", "Avg_parental_AS_TPM"
)
#add sense expression info onto highest AS table so those points can be graphed seperately
High_up_AS <- merge(TPM_Nas_AS_up, res1_n, by = "gene")
#scatterplot now with high AS highlighted in red
grob2 <- grid::grobTree(
grid::textGrob(
paste(
"Pearson Correlation Over Expressed AS: ",
round(
cor(
High_up_AS$log2FoldChange,
High_up_AS$AS_log2FoldChange,
use = "pairwise.complete.obs"),
4
)
),
x= 0.3,
y = 0.93,
hjust = 0,
gp = gpar(col = 'red', fontsize = 11, fontface = "bold")
)
)
ggplot2::ggplot(data = Combinded_RES) +
ggplot2::geom_vline(xintercept = 0, alpha = .5) +
ggplot2::geom_hline(yintercept = 0, alpha = .5) +
ggplot2::geom_point(
data = Combinded_RES,
mapping = aes(y = log2FoldChange, x = AS_log2FoldChange),
alpha = 4/10,
size = 1,
color = 'black'
) +
ggplot2::geom_smooth(
aes(y = log2FoldChange, x = AS_log2FoldChange),
method = lm,
se = TRUE,
color = 'forestgreen'
) +
ggplot2::theme_bw() +
ggplot2::geom_point(
data = High_up_AS,
aes(y = log2FoldChange,x=AS_log2FoldChange),
color = "red",
size =1,
alpha = 4/10
)+
ggplot2::annotation_custom(grob1) +
ggplot2::annotation_custom(grob2) +
ggplot2::geom_smooth(
data = Combinded_RES,
aes(y = log2FoldChange, x = AS_log2FoldChange),
method = lm,
se = TRUE,
color = 'blue'
) +
ggplot2::geom_smooth(
data = High_up_AS,
aes(y = log2FoldChange, x = AS_log2FoldChange),
method = lm,
se = TRUE,
color = "red"
)
#Make a new variable that’s the same thing (need this to remove genes, so no gene is asigned to more than one group)
#now we are binning genes by group of mRNA expression #group 1 up genes #group 5 down genes
#group 3 “neutral”
#remove group 1 genes from DeSeq2_Sense_Results
nrow(DeSeq2_Sense_Results) + nrow(grp1_DeSeq2_Sense_Results)
[1] 6600
#remove group 5 genes from DeSeq2_Sense_Results
nrow(grp1_DeSeq2_Sense_Results) + nrow(grp5_DeSeq2_Sense_Results)
[1] 808
#remove group 3 genes from DeSeq2_Sense_Results
nrow(grp3_DeSeq2_Sense_Results)
[1] 4418
#sanity check count rows
nrow(grp5_DeSeq2_Sense_Results)
[1] 72
#now from not group 1,3,5 - split into group 2 and 4
#sanity check count rows
nrow(grp1_DeSeq2_Sense_Results) +
nrow(grp5_DeSeq2_Sense_Results) +
nrow(grp3_DeSeq2_Sense_Results) +
nrow(grp2_DeSeq2_Sense_Results) +
nrow(grp4_DeSeq2_Sense_Results)
[1] 6488
#sanity check count rows
nrow(grp1_DeSeq2_Sense_Results) + nrow(grp5_DeSeq2_Sense_Results)
[1] 808
#sanity check count rows #only thing left in “DeSeq2_Sense_Results” is NA results
nrow(DeSeq2_Sense_Results)
[1] 112
#now we have groups by sense expression change #now make table to integrate AS expression info
#what happens next is how i make violin plots. #I figured this out like a year ago and have copy pasted this code a bunch. #this is the bins
#now the little seperate bins are glued together
#now we we make a violin plot
#now we we make a violin plot with p values
#now were going to do the same thing #the bins stay the same #but in the bins were are putting AS expression level
colnames(TPM_group1)
[1] "gene" "baseMean" "log2FoldChange" "lfcSE" "stat" "pvalue"
[7] "padj" "Q_6125_N_AS" "Q_7718_N_AS" "Q_6126_N_AS" "Q_7716_N_AS" "length_kb"
[13] "ratio_6125" "ratio_6126" "Avg_single_Tag_AS_TPM" "Avg_parental_AS_TPM"
#making data frames to put into violin plot
#glueing the data frames into one
#make violin plot
#make violin plot but now with p values
#now I am gating by fold change >2 instead of 4
#comparing between fold change >2 instead of 4
nrow(ONE_TPM_Nas_AS_up)
[1] 878
#New Sense filters
#new functional RNA catagory
nrow(ONE_UpAS_DownMRNA)
[1] 187
#write names only to file #thanks for writing this Kris
close(fileConn)
Error in UseMethod("close") :
no applicable method for 'close' applied to an object of class "character"
#calc overlaps
nrow(ONE_res1Nab3_up)
[1] 1496
#HERE ##now I am going to compare to 5781 and 2 WT Q and
G1 #so I gotta read that data in
#filtered by expression
#calc what high AS is
quantile(WT_TPM$Avg_Q_IP_ANTISense, probs = c(0.5, 0.75, 0.8,0.85, 0.9, 0.95))
50% 75% 80% 85% 90% 95%
3.369446 10.541939 14.414169 21.709871 36.698259 79.249805
#once again I am gating at 80%
#check colnames
colnames(WT_high_level_AS)
[1] "ensgene" "s5781_G1_IP" "s5781_Q_IP" "s5782_G1_IP" "s5782_Q_IP" "s5781_G1_IP_AS" "s5781_Q_IP_AS"
[8] "s5782_G1_IP_AS" "s5782_Q_IP_AS" "length_kb" "Avg_Q_IP_Sense" "Avg_G1_IP_Sense" "Avg_Q_IP_ANTISense" "Avg_G1_IP_ANTISense"
#merge high level and overexpressed
#now compare WT to Nab3 experiment
nrow(High_up_AS)
[1] 731
#of the 180 above - how many are functional?
nrow(AS_tested)
[1] 180
#making a list of gene names from Nab3 data so I can highlight those genes in WT data
nrow(ONE_TPM_Nas_AS_up_list)
[1] 878
#all genes in WT - AS tpm and differential expression
#graph it and it is a blob
#map Nab3 genes to WT data
#graph it and it is 2 blobs!! :)
#graph it again
#all on one. busy, very hard to look at.